In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', -1)  # show full column contents (-1 means "no limit" here; newer pandas versions use None)

In [2]:
import pickle

In [3]:
from sklearn.preprocessing import MultiLabelBinarizer

In [4]:
orders_df = pd.read_csv("../data/raw/orders.csv")
op_train_df = pd.read_csv("../data/raw/order_products__train.csv")
op_prior_df = pd.read_csv("../data/raw/order_products__prior.csv")

In [5]:
orders_df.head()


Out[5]:
order_id user_id eval_set order_number order_dow order_hour_of_day days_since_prior_order
0 2539329 1 prior 1 2 8 NaN
1 2398795 1 prior 2 3 7 15.0
2 473747 1 prior 3 3 12 21.0
3 2254736 1 prior 4 4 7 29.0
4 431534 1 prior 5 4 15 28.0

Extract features based on the "prior" set


In [6]:
orders_prior_info = orders_df[orders_df.eval_set == "prior"]

In [7]:
orders_prior_info.shape


Out[7]:
(3214874, 7)

In [8]:
prior_uid_pid_df = pd.merge(orders_df[['user_id', 'order_id']], op_prior_df, how = "inner", on = "order_id")

In [9]:
prior_uid_pid_df.shape


Out[9]:
(32434489, 5)

User features

  • number of orders per user

In [10]:
num_orders_per_uid = orders_prior_info.groupby('user_id')['order_id'].\
                                      agg('count').\
                                      reset_index().\
                                      rename(columns = {'order_id': 'num_orders'})

In [11]:
num_orders_per_uid.head()


Out[11]:
user_id num_orders
0 1 10
1 2 14
2 3 12
3 4 5
4 5 4
  • average number of (reordered) products per user

In [12]:
# per order: count products and sum the reordered flags; then average both counts per user
num_product_per_user = prior_uid_pid_df.groupby("order_id").agg({'product_id':'count', 'reordered': 'sum', 'user_id': 'first'}).\
                        reset_index().\
                        rename(columns = {'product_id': 'num_products', 'reordered': 'num_reordered'}).\
                        groupby("user_id").\
                        agg({'num_products': 'mean', 'num_reordered': 'mean'}).\
                        reset_index().\
                        rename(columns = {'num_products': 'avg_num_pids', 'num_reordered': 'avg_num_reordered'})

In [14]:
num_product_per_user['perc_reordered'] = num_product_per_user['avg_num_reordered']/num_product_per_user['avg_num_pids']
num_product_per_user = np.round(num_product_per_user, 2)

In [15]:
num_product_per_user.head()


Out[15]:
user_id avg_num_reordered avg_num_pids perc_reordered
0 1 4.10 5.90 0.69
1 2 6.64 13.93 0.48
2 3 4.58 7.33 0.62
3 4 0.20 3.60 0.06
4 5 3.50 9.25 0.38

Order Temporal features

  • number of orders for each day of week

In [16]:
user_dow = orders_prior_info.groupby(['user_id', 'order_dow'])['order_id'].count().reset_index().\
            rename(columns = {'order_id': 'cnt_dow'}).\
            pivot(index = "user_id", columns = "order_dow", values = "cnt_dow")

In [17]:
user_dow.columns.name = None
user_dow = user_dow.add_prefix('dow_').reset_index().fillna(0)

In [18]:
user_dow.head()


Out[18]:
user_id dow_0 dow_1 dow_2 dow_3 dow_4 dow_5 dow_6
0 1 0.0 3.0 2.0 2.0 3.0 0.0 0.0
1 2 0.0 5.0 5.0 2.0 1.0 1.0 0.0
2 3 6.0 2.0 1.0 3.0 0.0 0.0 0.0
3 4 0.0 0.0 0.0 0.0 2.0 2.0 1.0
4 5 1.0 1.0 0.0 2.0 0.0 0.0 0.0
  • most frequent shopping day of the week

In [19]:
user_dow['most_dow'] = user_dow.iloc[:, 1:].idxmax(axis=1)

In [20]:
user_dow.head()


Out[20]:
user_id dow_0 dow_1 dow_2 dow_3 dow_4 dow_5 dow_6 most_dow
0 1 0.0 3.0 2.0 2.0 3.0 0.0 0.0 dow_1
1 2 0.0 5.0 5.0 2.0 1.0 1.0 0.0 dow_1
2 3 6.0 2.0 1.0 3.0 0.0 0.0 0.0 dow_0
3 4 0.0 0.0 0.0 0.0 2.0 2.0 1.0 dow_4
4 5 1.0 1.0 0.0 2.0 0.0 0.0 0.0 dow_3
  • number of orders by daytime (sleeping, morning, noon, afternoon, evening, night)

In [21]:
bins = [0, 6, 10, 13, 17, 20, 24]
group_names = ['sleeping', 'morning', 'noon', 'afternoon', 'evening', 'night']
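
With include_lowest=True and the default right-closed bins, hours 0-6 map to 'sleeping', 7-10 to 'morning', 11-13 to 'noon', 14-17 to 'afternoon', 18-20 to 'evening', and 21-23 to 'night'. An illustrative one-liner (not part of the original run):

pd.cut(pd.Series([3, 8, 12, 15, 19, 23]), bins=bins, labels=group_names, include_lowest=True)
# -> sleeping, morning, noon, afternoon, evening, night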

In [22]:
orders_prior_info['daytime'] = pd.cut(orders_prior_info['order_hour_of_day'], bins = bins, labels= group_names, include_lowest=True)


c:\users\jpzha\appdata\local\programs\python\python35\lib\site-packages\ipykernel\__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
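
The warning is harmless here, but it can be avoided by taking an explicit copy of the prior slice before adding the column, e.g. (a sketch, not how the notebook was run):

orders_prior_info = orders_df[orders_df.eval_set == "prior"].copy()
orders_prior_info['daytime'] = pd.cut(orders_prior_info['order_hour_of_day'], bins=bins, labels=group_names, include_lowest=True)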

In [23]:
orders_prior_info.head()


Out[23]:
order_id user_id eval_set order_number order_dow order_hour_of_day days_since_prior_order daytime
0 2539329 1 prior 1 2 8 NaN morning
1 2398795 1 prior 2 3 7 15.0 morning
2 473747 1 prior 3 3 12 21.0 noon
3 2254736 1 prior 4 4 7 29.0 morning
4 431534 1 prior 5 4 15 28.0 afternoon

In [24]:
user_daytime = orders_prior_info.groupby(['user_id', 'daytime'])['order_id'].count().reset_index().\
            rename(columns = {'order_id': 'cnt_daytime'}).\
            pivot(index = "user_id", columns = "daytime", values = "cnt_daytime").fillna(0)

In [25]:
user_daytime.columns.name = None
user_daytime = user_daytime.add_prefix('daytime_').reset_index().fillna(0)
user_daytime.head()


Out[25]:
user_id daytime_sleeping daytime_morning daytime_noon daytime_afternoon daytime_evening daytime_night
0 1 0.0 6.0 1.0 3.0 0.0 0.0
1 2 0.0 8.0 5.0 1.0 0.0 0.0
2 3 0.0 0.0 0.0 9.0 3.0 0.0
3 4 0.0 0.0 4.0 1.0 0.0 0.0
4 5 0.0 0.0 1.0 1.0 2.0 0.0
  • most frequent shopping daytime

In [26]:
user_daytime['most_daytime'] = user_daytime.iloc[:, 1:].idxmax(axis=1)

In [27]:
user_daytime.head()


Out[27]:
user_id daytime_sleeping daytime_morning daytime_noon daytime_afternoon daytime_evening daytime_night most_daytime
0 1 0.0 6.0 1.0 3.0 0.0 0.0 daytime_morning
1 2 0.0 8.0 5.0 1.0 0.0 0.0 daytime_morning
2 3 0.0 0.0 0.0 9.0 3.0 0.0 daytime_afternoon
3 4 0.0 0.0 4.0 1.0 0.0 0.0 daytime_noon
4 5 0.0 0.0 1.0 1.0 2.0 0.0 daytime_evening
  • average of days_since_prior_order per user

In [28]:
user_days_since_po = orders_prior_info.groupby('user_id')['days_since_prior_order'].mean().reset_index().\
                        rename(columns = {'days_since_prior_order': 'avg_days_since_prior_order'})

In [29]:
user_days_since_po.head()


Out[29]:
user_id avg_days_since_prior_order
0 1 19.555556
1 2 15.230769
2 3 12.090909
3 4 13.750000
4 5 13.333333

Product Aggregated statistics features


In [30]:
products_df = pd.read_csv("../data/raw/products.csv")
aisles_df = pd.read_csv("../data/raw/aisles.csv")
departments_df = pd.read_csv("../data/raw/departments.csv")

In [31]:
products_df = pd.merge(pd.merge(products_df, aisles_df, on = "aisle_id"), departments_df, on = "department_id")

In [32]:
products_df.head()


Out[32]:
product_id product_name aisle_id department_id aisle department
0 1 Chocolate Sandwich Cookies 61 19 cookies cakes snacks
1 78 Nutter Butter Cookie Bites Go-Pak 61 19 cookies cakes snacks
2 102 Danish Butter Cookies 61 19 cookies cakes snacks
3 172 Gluten Free All Natural Chocolate Chip Cookies 61 19 cookies cakes snacks
4 285 Mini Nilla Wafers Munch Pack 61 19 cookies cakes snacks

In [33]:
products_df.aisle.nunique()


Out[33]:
134

In [34]:
products_df.department.nunique()


Out[34]:
21

In [35]:
prior_uid_pid_df = pd.merge(prior_uid_pid_df, products_df[["product_id", "aisle_id", "department_id"]],\
                            how = "inner", on = "product_id")

In [36]:
prior_uid_pid_df.head()


Out[36]:
user_id order_id product_id add_to_cart_order reordered aisle_id department_id
0 1 2539329 196 1 0 77 7
1 1 2398795 196 1 1 77 7
2 1 473747 196 1 1 77 7
3 1 2254736 196 1 1 77 7
4 1 431534 196 1 1 77 7
  • top 3 most frequently reordered products

In [37]:
def get_most_reorder_pids(grouped_df, top = 3):
    # most frequently reordered product ids for one user, padded with NaN when fewer than `top` exist
    top_pids = grouped_df['product_id'].value_counts().nlargest(top).index
    top1 = int(top_pids[0])
    try:
        top2 = int(top_pids[1])
    except IndexError:
        top2 = np.nan
    try:
        top3 = int(top_pids[2])
    except IndexError:
        top3 = np.nan
    return pd.Series({'top1_reordered_pid': top1,
                      'top2_reordered_pid': top2,
                      'top3_reordered_pid': top3})

In [38]:
%time top3_reordered_pids_user = prior_uid_pid_df[prior_uid_pid_df.reordered == 1].groupby('user_id').apply(get_most_reorder_pids)


Wall time: 3min 16s
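
The groupby/apply above is slow (over three minutes). A vectorized alternative (a sketch, not how the results below were produced) counts (user_id, product_id) pairs once, keeps the three most frequent per user, and pivots them into columns; the same pattern also applies to the aisle and department features further down:

reordered = prior_uid_pid_df[prior_uid_pid_df.reordered == 1]
pair_counts = (reordered.groupby(['user_id', 'product_id']).size()
                        .reset_index(name='cnt')
                        .sort_values(['user_id', 'cnt'], ascending=[True, False]))
top3 = pair_counts.groupby('user_id').head(3).copy()
top3['rank'] = top3.groupby('user_id').cumcount() + 1
top3_wide = (top3.pivot(index='user_id', columns='rank', values='product_id')
                 .rename(columns={1: 'top1_reordered_pid', 2: 'top2_reordered_pid', 3: 'top3_reordered_pid'})
                 .reset_index())

Ties may break differently than with value_counts, so the top-2/top-3 picks can occasionally differ from the output below.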

In [39]:
top3_reordered_pids_user = top3_reordered_pids_user.reset_index()
top3_reordered_pids_user.head()


Out[39]:
user_id top1_reordered_pid top2_reordered_pid top3_reordered_pid
0 1 12427.0 196.0 10258.0
1 2 32792.0 47209.0 24852.0
2 3 39190.0 47766.0 21903.0
3 4 35469.0 NaN NaN
4 5 26604.0 11777.0 43693.0
  • most often reordered aisle

In [40]:
def get_most_reordered_aiesle(grouped_df):
    # return the single most frequently reordered aisle_id for one user
    top1 = grouped_df['aisle_id'].value_counts().nlargest(1).index[0]
    return top1

In [42]:
%time top1_reordered_aiesle_user = pd.DataFrame.from_records([ (g, get_most_reordered_aiesle(grp))\
                                           for g, grp in prior_uid_pid_df[prior_uid_pid_df.reordered == 1].groupby('user_id')],\
                                        columns = ['user_id', 'most_reordered_aiesle'])


Wall time: 2min 51s

In [43]:
top1_reordered_aiesle_user.head()


Out[43]:
user_id most_reordered_aiesle
0 1 77
1 2 120
2 3 24
3 4 38
4 5 123
  • most often reordered department

In [46]:
def get_most_reordered_dpmt(grouped_df):
    # return the single most frequently reordered department_id for one user
    top1 = grouped_df['department_id'].value_counts().nlargest(1).index[0]
    return top1

In [47]:
%time top1_reordered_dpmt_user = pd.DataFrame.from_records([(g, get_most_reordered_dpmt(grp))\
                                           for g, grp in prior_uid_pid_df[prior_uid_pid_df.reordered == 1].groupby('user_id')],\
                                                          columns = ['user_id', 'most_reordered_dpmt'])


Wall time: 2min 40s

In [48]:
top1_reordered_dpmt_user.head()


Out[48]:
user_id most_reordered_dpmt
0 1 19
1 2 16
2 3 4
3 4 1
4 5 4

Product embedding features


In [49]:
import gensim


c:\users\jpzha\appdata\local\programs\python\python35\lib\site-packages\gensim\utils.py:860: UserWarning: detected Windows; aliasing chunkize to chunkize_serial
  warnings.warn("detected Windows; aliasing chunkize to chunkize_serial")
  • user purchased product frequency vector

In [50]:
prior_uid_pid_cnt_df = prior_uid_pid_df[['user_id', 'product_id']].groupby(['user_id','product_id']).size()

In [51]:
prior_uid_pid_cnt_df = prior_uid_pid_cnt_df.reset_index().rename(columns = {0:'count'})

In [52]:
prior_uid_pid_cnt_df.head()


Out[52]:
user_id product_id count
0 1 196 10
1 1 10258 9
2 1 10326 1
3 1 12427 10
4 1 13032 3

In [53]:
uid_pid_cnt_dict = [{'user_id': k, 'pid_freq_dict': dict(zip(g['product_id'], g['count'])) }\
                    for k,g in prior_uid_pid_cnt_df.groupby('user_id')]
uid_pid_freq_df = pd.DataFrame(uid_pid_cnt_dict)

In [54]:
uid_pid_freq_df.head()


Out[54]:
pid_freq_dict user_id
0 {17122: 1, 41787: 1, 196: 10, 26405: 2, 26088: 2, 39657: 1, 12427: 10, 25133: 8, 35951: 1, 38928: 1, 13032: 3, 10258: 9, 49235: 2, 10326: 1, 13176: 2, 14084: 1, 30450: 1, 46149: 3} 1
1 {38656: 1, 34688: 3, 40198: 2, 45066: 3, 2573: 2, 37646: 1, 44303: 1, 18961: 1, 5907: 1, 24852: 7, 27413: 1, 23: 1, 32792: 9, 30489: 1, 7963: 1, 8479: 1, 46886: 2, 13351: 2, 19240: 1, 22825: 1, 49451: 1, 22829: 1, 20785: 1, 32052: 3, 17758: 2, 41787: 2, 27966: 2, 10305: 1, 32139: 4, 15841: 1, 17224: 1, 5450: 1, 14306: 1, 3151: 1, 48210: 1, 46676: 1, 5322: 1, 27737: 1, 18523: 6, 5212: 1, 4957: 1, 20574: 2, 7781: 3, 42342: 1, 8296: 1, 47209: 8, 19051: 4, 22124: 4, 35917: 1, 22474: 3, 19057: 1, 42356: 1, 13176: 1, 49273: 1, 24954: 1, 40571: 1, 45948: 1, 36735: 3, 21376: 2, 16521: 1, 1559: 6, 47766: 4, 16797: 1, 24990: 1, 9124: 1, 33957: 1, 47526: 1, 13742: 1, 47792: 1, 22963: 2, 21150: 1, 20084: 1, 22559: 1, 30908: 1, 28874: 1, 36287: 1, 24768: 1, 47553: 1, 39877: 1, 8138: 1, 21709: 2, 16589: 5, 27344: 2, 9681: 2, 2002: 4, 19156: 6, 14553: 1, 33754: 5, 79: 1, 45613: 2, 12000: 5, 17872: 3, 12258: 1, 48099: 1, 4071: 1, 21227: 1, 5869: 1, 48110: 2, 47144: 1, 28918: 1, ...} 2
2 {18370: 1, 9387: 5, 17668: 5, 16965: 2, 24010: 2, 44683: 2, 12845: 1, 14992: 2, 21137: 1, 32402: 3, 22035: 3, 8021: 1, 39190: 10, 42265: 1, 38596: 1, 7503: 1, 40604: 1, 16797: 3, 23650: 2, 1819: 3, 15143: 1, 24810: 3, 18599: 4, 1005: 1, 47766: 9, 39922: 1, 49683: 1, 21903: 8, 248: 1, 43961: 4, 48523: 2, 42557: 1, 28373: 2} 3
3 {42329: 1, 26576: 1, 43704: 1, 21573: 1, 17769: 1, 22199: 1, 35469: 2, 37646: 1, 1200: 1, 19057: 1, 2707: 1, 7350: 1, 25623: 1, 7160: 1, 11865: 1, 25146: 1, 36606: 1} 4
4 {11777: 4, 40706: 2, 8518: 2, 28289: 1, 18761: 1, 22475: 1, 27344: 1, 20754: 1, 24535: 3, 6808: 1, 31717: 1, 3376: 1, 13988: 2, 21413: 2, 24231: 1, 16168: 1, 48775: 1, 26604: 4, 43693: 3, 13870: 1, 5999: 1, 21616: 1, 15349: 1} 5

In [203]:
uid_pid_freq_df.shape


Out[203]:
(206209, 2)

In [204]:
# save the product count feature for all users 
with open("../data/processed/user_prods_cnt_dict_features.pickle", "wb") as handle:
    pickle.dump(uid_pid_freq_df, handle)

In [55]:
del uid_pid_cnt_dict, prior_uid_pid_cnt_df
  • calculate the purchase embedding vector
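
Concretely, each user's purchase embedding (computed in the cells below) is the frequency-weighted sum of product vectors, u = Σ_p freq(u, p) · v_p, where v_p is the Word2Vec vector of product p and the sum runs over the user's prior purchases; products missing from the embedding vocabulary are skipped.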

In [56]:
product_emd = gensim.models.Word2Vec.load("../data/interim/product2vec.model")

In [57]:
def get_purchase_embedding(pid_freq_dict_series, product_emd):
    # for each user, sum the Word2Vec product vectors weighted by purchase frequency
    all_uids_purchase_emd = []
    for pfreq_d in pid_freq_dict_series:
        uid_purchase_emd_ls = []
        for pid, freq in pfreq_d.items():
            try:
                temp_vec = product_emd[str(pid)]*freq
                uid_purchase_emd_ls.append(temp_vec)
            except KeyError:
                # skip products that are not in the embedding vocabulary
                continue
        one_uid_purchase_emd = np.stack(uid_purchase_emd_ls).sum(axis = 0)
        all_uids_purchase_emd.append(one_uid_purchase_emd)
    return np.stack(all_uids_purchase_emd)

In [58]:
uids_purchase_emd = get_purchase_embedding(uid_pid_freq_df['pid_freq_dict'], product_emd)
uid_pid_emb_df = pd.DataFrame(data = uids_purchase_emd, index = uid_pid_freq_df['user_id'])
uid_pid_emb_df = uid_pid_emb_df.add_prefix('emb_')

In [59]:
uid_pid_emb_df = uid_pid_emb_df.reset_index()
uid_pid_emb_df.head()


Out[59]:
user_id emb_0 emb_1 emb_2 emb_3 emb_4 emb_5 emb_6 emb_7 emb_8 ... emb_90 emb_91 emb_92 emb_93 emb_94 emb_95 emb_96 emb_97 emb_98 emb_99
0 1 -26.714828 10.802811 -25.158247 0.034250 -13.170386 37.165924 -57.506771 -67.410599 46.844711 ... -9.796089 31.366833 36.130058 23.546530 10.425104 -45.494610 -41.286137 -15.234282 -27.943871 -1.224820
1 2 81.621399 38.889706 82.688820 -102.128090 -19.691454 287.458893 -180.793015 35.621387 -54.345078 ... 120.507309 -108.388176 1.776031 23.358995 -148.381149 88.404236 -56.325516 -31.194061 18.958136 25.864012
2 3 40.915524 -18.381775 75.038582 -1.873290 -58.543594 93.821640 -24.789171 79.996437 -30.540491 ... 27.517372 -92.990982 48.024872 -35.506332 -128.823105 77.397263 16.856283 -16.234632 28.117483 46.908470
3 4 1.429444 2.203788 -0.556785 -11.420183 -6.547275 0.305758 0.160394 -8.673797 -5.231474 ... 3.262614 18.981794 -9.738063 21.261660 2.639111 2.720276 -2.564438 4.853611 -3.863805 9.918034
4 5 7.796177 -11.155602 12.636064 37.787033 11.076339 43.303268 3.458212 15.993218 -27.569876 ... -9.303144 -57.355591 -3.194045 -31.843639 -23.899057 35.830971 18.920025 3.286705 26.969088 18.922802

5 rows × 101 columns

Prepare the training set

Join all the feature tables with the train set on user_id.

All the feature tables:

  • num_orders_per_uid
  • num_product_per_user
  • user_dow
  • user_daytime
  • user_days_since_po
  • top3_reordered_pids_user
  • top1_reordered_aiesle_user
  • top1_reordered_dpmt_user
  • uid_pid_emb_df

train set orders info:

  • orders_df[orders_df.eval_set == "train"]

train set labels info:

  • "../data/interim/train_labels.csv"

In [60]:
X_features = pd.concat([num_orders_per_uid, num_product_per_user, user_dow, user_daytime, user_days_since_po,\
          top3_reordered_pids_user, top1_reordered_aiesle_user, top1_reordered_dpmt_user, uid_pid_emb_df], axis = 1)
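
The axis=1 concat assumes every feature table has one row per user in the same user_id order; if any table were missing users (e.g. users who never reordered), the columns would misalign and a merge on user_id would be the safer join. An illustrative check:

for df in [num_product_per_user, user_dow, user_daytime, user_days_since_po,
           top3_reordered_pids_user, top1_reordered_aiesle_user, top1_reordered_dpmt_user, uid_pid_emb_df]:
    assert df.shape[0] == num_orders_per_uid.shape[0]
    assert (df['user_id'].values == num_orders_per_uid['user_id'].values).all()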

In [61]:
X_features.shape


Out[61]:
(206209, 134)

In [62]:
X_features = X_features.loc[:, ~ X_features.columns.duplicated()]

In [63]:
X_features.head()


Out[63]:
user_id num_orders avg_num_reordered avg_num_pids perc_reordered dow_0 dow_1 dow_2 dow_3 dow_4 ... emb_90 emb_91 emb_92 emb_93 emb_94 emb_95 emb_96 emb_97 emb_98 emb_99
0 1 10 4.10 5.90 0.69 0.0 3.0 2.0 2.0 3.0 ... -9.796089 31.366833 36.130058 23.546530 10.425104 -45.494610 -41.286137 -15.234282 -27.943871 -1.224820
1 2 14 6.64 13.93 0.48 0.0 5.0 5.0 2.0 1.0 ... 120.507309 -108.388176 1.776031 23.358995 -148.381149 88.404236 -56.325516 -31.194061 18.958136 25.864012
2 3 12 4.58 7.33 0.62 6.0 2.0 1.0 3.0 0.0 ... 27.517372 -92.990982 48.024872 -35.506332 -128.823105 77.397263 16.856283 -16.234632 28.117483 46.908470
3 4 5 0.20 3.60 0.06 0.0 0.0 0.0 0.0 2.0 ... 3.262614 18.981794 -9.738063 21.261660 2.639111 2.720276 -2.564438 4.853611 -3.863805 9.918034
4 5 4 3.50 9.25 0.38 1.0 1.0 0.0 2.0 0.0 ... -9.303144 -57.355591 -3.194045 -31.843639 -23.899057 35.830971 18.920025 3.286705 26.969088 18.922802

5 rows × 126 columns


In [64]:
# save the feature matrix for all users 
with open("../data/processed/all_users_features.pickle", "wb") as handle:
    pickle.dump(X_features, handle)
  • Get the train feature matrix

In [65]:
X_train_orders = orders_df[orders_df.eval_set == "train"]

In [66]:
X_train_orders.shape


Out[66]:
(131209, 7)

In [67]:
X_train_orders.head()


Out[67]:
order_id user_id eval_set order_number order_dow order_hour_of_day days_since_prior_order
10 1187899 1 train 11 4 8 14.0
25 1492625 2 train 15 1 11 30.0
49 2196797 5 train 5 0 11 6.0
74 525192 7 train 21 2 11 6.0
78 880375 8 train 4 1 14 10.0

In [68]:
# combine the train set order info with the user feature matrix
X_train = pd.merge(X_train_orders, X_features, how = "inner", on = "user_id")
  • join the train label information to keep the order consistent

In [69]:
y_train_df = pd.read_csv("../data/interim/train_labels.csv")

In [70]:
y_train_df.head()


Out[70]:
order_id products
0 1 49302 11109 43633 22035
1 36 19660 43086 46620 34497 48679 46979
2 38 21616
3 96 20574 40706 27966 24489 39275
4 98 8859 19731 43654 13176 4357 37664 34065 35951 43560 9896 27509 15455 27966 47601 40396 35042 40986 1939 46313 329 30776 36695 27683 15995 27344 47333 48287 45204 24964 18117 46413 34126 9373 22935 46720 44479 790 18441 45007 20520 7461 26317 3880 41387 17747

In [71]:
y_train_df[y_train_df.products == "None"].shape


Out[71]:
(8602, 2)

In [183]:
# number of unique reordered pid outcomes
pid_ls_str = ' '.join(y_train_df['products'])
num_reordered_pids = len(set(pid_ls_str.split(' ')))

In [184]:
num_reordered_pids


Out[184]:
30177
  • check the unique pids in the purchase history of users in the test set

In [73]:
test_user_ids = orders_df[orders_df.eval_set=="test"]['user_id'].tolist()

In [115]:
def products_concat(series):
    out = ''
    for product in series:
        if product > 0:
            out = out + str(int(product)) + ' '
    
    if out != '':
        return out.rstrip()
    else:
        return 'None'
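
For example (illustrative), the helper turns a series of product ids into a space-separated string and falls back to 'None' when nothing valid is present:

products_concat(pd.Series([196.0, 10258.0, np.nan]))   # -> '196 10258'
products_concat(pd.Series([], dtype=float))            # -> 'None'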

In [122]:
test_uid_prev_prods = orders_df[(orders_df.user_id.isin(test_user_ids)) & (orders_df.eval_set=='prior')].merge(op_prior_df, on = "order_id")

In [124]:
test_users_prev_orders_prods = test_uid_prev_prods.groupby('user_id')['product_id'].apply(products_concat).reset_index()

In [131]:
test_users_prev_orders_prods.head()


Out[131]:
user_id product_id
0 3 9387 17668 15143 16797 39190 47766 21903 39922 24810 32402 38596 21903 248 40604 8021 17668 21137 23650 32402 39190 47766 21903 49683 28373 7503 1819 12845 9387 16965 24010 39190 9387 17668 47766 16965 28373 21903 22035 42265 18370 44683 22035 39190 9387 47766 1819 24810 14992 44683 39190 1819 21903 43961 16797 24010 14992 18599 9387 39190 22035 43961 18599 21903 47766 42557 48523 39190 47766 16797 43961 48523 18599 17668 47766 39190 1005 32402 39190 47766 21903 43961 17668 39190 18599 23650 21903 47766 24810
1 4 36606 7350 35469 2707 42329 7160 1200 17769 43704 37646 11865 35469 19057 22199 25146 26576 25623 21573
2 6 27521 38293 21903 48679 38293 20323 40992 21903 45007 11068 10644 49401 25659 8424
3 11 47912 8309 28465 42585 36070 2002 16869 35765 35738 18987 10644 38456 35640 47601 18465 35640 35948 8670 14947 42585 41290 34658 33731 27959 36070 23400 41290 14947 35948 27959 8197 8670 33731 35640 33572 42585 16869 34658 8309 38456 19543 35948 8309 8670 5605 95 19769 14947 12384 28465 26209 38970 10644 42736 27451 20383 30855 44632 43352 44987 27959 35948 8230 30480 22950 34551 44632 15261 33037 20383 35321 27959 40738 41319 28465 8309 14947 34658 40055 24799 17706 33572 27959 48697 49374 8309 30563 5989 41909 17794 13176 4799 33021 21040
4 12 17159 21616 40377 13176 10863 17766 7076 28134 46157 20350 48835 33761 28985 5876 34358 11845 21198 27243 12206 14992 5746 47672 8239 38164 42450 12872 44683 10863 7076 20350 16589 34216 13176 14992 42736 21616 19895 18761 45056 37646 38693 47144 8239 44661 22935 35345 12797 10978 33443 28342 20144 5746 13176 14992 44422 11520 31506 22959 7120 37687 19006 39667 17794 28134 10863 7076 30489 49683 24390 47509 34243 48364 16185 248

In [132]:
test_users_prev_orders_prods.to_csv("../data/interim/test_users_pids_bought.csv")

In [128]:
# number of products for users in the test set 
test_pid_ls_str = ' '.join(test_users_prev_orders_prods['product_id'])
len(set(test_pid_ls_str.split(' ')))


Out[128]:
48457

In [135]:
products_df.product_id.nunique()


Out[135]:
49688

So the test users' purchase histories contain products that never appear in the train labels (48,457 unique product ids vs. 30,177). It is better to represent each label as an indicator vector over all 49,688 products, plus a 'None' class for orders with no reordered items.


In [136]:
X_train = pd.merge(X_train, y_train_df, how = "inner", on = "order_id")

In [137]:
X_train.head()


Out[137]:
order_id user_id eval_set order_number order_dow order_hour_of_day days_since_prior_order num_orders avg_num_reordered avg_num_pids ... emb_91 emb_92 emb_93 emb_94 emb_95 emb_96 emb_97 emb_98 emb_99 products
0 1187899 1 train 11 4 8 14.0 10 4.10 5.90 ... 31.366833 36.130058 23.546530 10.425104 -45.494610 -41.286137 -15.234282 -27.943871 -1.224820 196 25133 38928 26405 39657 10258 13032 26088 49235 46149
1 1492625 2 train 15 1 11 30.0 14 6.64 13.93 ... -108.388176 1.776031 23.358995 -148.381149 88.404236 -56.325516 -31.194061 18.958136 25.864012 22963 7963 16589 32792 41787 22825 24852 45066 5450 22559 45613 33957
2 2196797 5 train 5 0 11 6.0 4 3.50 9.25 ... -57.355591 -3.194045 -31.843639 -23.899057 35.830971 18.920025 3.286705 26.969088 18.922802 15349 21413 40706 21616
3 525192 7 train 21 2 11 6.0 20 6.90 10.30 ... -231.748123 153.659866 -317.609009 -251.143356 128.311203 74.937943 62.291996 195.875809 28.809772 47272 37999 13198 43967 40852 17638 29894 45066
4 880375 8 train 4 1 14 10.0 3 4.33 16.33 ... -77.055519 8.204720 -58.113785 -44.359272 59.807442 53.728695 -11.124917 -11.276837 11.506551 15937 23165 21903 41540

5 rows × 133 columns


In [223]:
X_train[X_train.products == "None"].head()


Out[223]:
order_id user_id eval_set order_number order_dow order_hour_of_day days_since_prior_order num_orders avg_num_reordered avg_num_pids ... emb_91 emb_92 emb_93 emb_94 emb_95 emb_96 emb_97 emb_98 emb_99 products
6 1822501 10 train 6 0 19 30.0 5 9.800000 28.600000 ... -230.211487 5.730721 -106.121880 -142.539398 160.519669 101.908089 -38.074680 69.347435 4.375209 None
23 2436259 44 train 4 0 12 30.0 3 2.333333 12.333333 ... 19.159031 -1.656268 5.061689 28.789291 -2.363226 3.690811 -17.018866 -2.435671 -4.854250 None
25 2906490 47 train 6 4 22 16.0 5 1.000000 5.200000 ... -7.445229 10.061296 -16.668764 8.518276 4.098104 8.319317 -2.216025 -2.315904 -4.365201 None
58 1179185 93 train 15 1 10 5.0 14 0.357143 2.214286 ... 27.313814 -18.379974 11.125583 2.677087 7.727707 -9.927894 -14.212251 -13.137720 3.635296 None
72 3032922 112 train 27 2 18 6.0 26 3.538462 6.076923 ... -123.882759 -23.177948 -42.518337 -171.473160 89.357986 78.872139 35.444054 140.189163 -4.364416 None

5 rows × 132 columns


In [140]:
len(set(products_df.product_id))


Out[140]:
49688

In [173]:
all_outcomes = ['None'] + list(set(products_df.product_id.astype('str')))

In [174]:
len(all_outcomes)


Out[174]:
49689

In [175]:
labels_tuple_ls = [tuple(label.split(' ')) for label in X_train['products']]

In [176]:
labels_tuple_ls[6] #double check


Out[176]:
('None',)

In [177]:
mlb = MultiLabelBinarizer(sparse_output=True)

In [178]:
mlb.fit([all_outcomes])


Out[178]:
MultiLabelBinarizer(classes=None, sparse_output=True)

In [179]:
mlb.classes_


Out[179]:
array(['1', '10', '100', ..., '9998', '9999', 'None'], dtype=object)

In [180]:
y_train = mlb.transform(labels_tuple_ls)

In [181]:
type(y_train)


Out[181]:
scipy.sparse.csr.csr_matrix

In [182]:
y_train.shape


Out[182]:
(131209, 49689)
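
As an illustrative round-trip check, the binarizer can map a row of y_train back to its label strings (the tuple follows the order of mlb.classes_, not the original product string):

mlb.inverse_transform(y_train[0])
# e.g. -> [('10258', '13032', '196', ...)]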

In [183]:
y_train.data.nbytes # 3.35 MB


Out[183]:
3349704

In [184]:
# save the transformed train labels
with open("../data/processed/y_train.pickle", "wb") as handle:
    pickle.dump(y_train, handle)
  • reduce the data memory usage

In [185]:
X_train.info(memory_usage= "deep")


<class 'pandas.core.frame.DataFrame'>
Int64Index: 131209 entries, 0 to 131208
Columns: 133 entries, order_id to products
dtypes: float32(100), float64(23), int64(6), object(4)
memory usage: 116.2 MB

In [186]:
# preprocess the data
features = X_train.columns
drop_cols = ['order_id', 'eval_set', 'products']
uint8_cols = features[features.str.startswith("dow_")].tolist() +\
             features[features.str.startswith("daytime_")].tolist() +\
             features[features.str.startswith("order_")].tolist() +\
             features[features.str.startswith("avg_")].tolist() +\
             ["days_since_prior_order", "avg_days_since_prior_order"]
uint8_cols.remove('order_id')
str_cat_cols = features[features.str.startswith("top")].tolist() + \
               features[features.str.startswith("most_reordered_")].tolist() +\
                ['most_daytime', 'most_dow']
round_cols = features[features.str.startswith("emb")].tolist()

In [187]:
def clean_data_type(df, convert_ls, float_round_ls = None, drop_cols_ls = None):
    # drop unused columns, cast column groups to smaller dtypes, and round float columns
    df = df.copy()
    df = df.drop(drop_cols_ls, axis = 1)
    
    # convert_ls is a list of (dtype, [column names]) pairs
    for ct in convert_ls:
        convert_type = ct[0]
        convert_cols = ct[1]
        for col in convert_cols:
            df[col] = df[col].astype(convert_type)
    
    df[float_round_ls] = np.round(df[float_round_ls], 2)
    
    return df

In [188]:
convert_cols = [('uint8', uint8_cols), ('str', str_cat_cols)]
X_train = clean_data_type(X_train, convert_ls = convert_cols, float_round_ls = round_cols, drop_cols_ls = drop_cols)

In [189]:
X_train.info(memory_usage= "deep")


<class 'pandas.core.frame.DataFrame'>
Int64Index: 131209 entries, 0 to 131208
Columns: 130 entries, user_id to emb_99
dtypes: float32(100), float64(1), int64(2), object(7), uint8(20)
memory usage: 112.4 MB

In [191]:
# save the cleaned and transformed training set - ready for modeling
with open("../data/processed/X_train.pickle", "wb") as handle:
    pickle.dump(X_train, handle)

In [192]:
X_train.isnull().sum().sort_values(ascending=False)[0] #no missing values


Out[192]:
0

Prepare the test feature matrix


In [193]:
test_orders = orders_df[orders_df.eval_set == "test"]

In [194]:
test_orders.shape


Out[194]:
(75000, 7)

In [195]:
X_test = pd.merge(test_orders, X_features, how = "inner", on = "user_id")

In [196]:
X_test.shape #double check


Out[196]:
(75000, 132)

In [197]:
drop_cols = ['order_id', 'eval_set']
X_test = clean_data_type(X_test, convert_ls = convert_cols, float_round_ls = round_cols, drop_cols_ls = drop_cols)

In [198]:
X_test.shape


Out[198]:
(75000, 130)

In [199]:
X_test.info(memory_usage="deep")


<class 'pandas.core.frame.DataFrame'>
Int64Index: 75000 entries, 0 to 74999
Columns: 130 entries, user_id to emb_99
dtypes: float32(100), float64(1), int64(2), object(7), uint8(20)
memory usage: 64.3 MB

In [200]:
# save the cleaned and transformed test set - ready for prediction
with open("../data/processed/X_test.pickle", "wb") as handle:
    pickle.dump(X_test, handle)

In [201]:
X_test.isnull().sum().sort_values(ascending=False)[0] #no missing values


Out[201]:
0
